#! /usr/bin/env python

try:
    import psyco
    psyco.full()
    print 'psyco activated.'
except:
    pass

import os
import sys
import optparse

def debug(*what):
    """Write a '[DEBUG]: ' line to stderr.

    Every positional argument is converted with unicode() and the results
    are joined with single spaces to form the message.
    """
    message = u' '.join([unicode(item) for item in what])
    print >> sys.stderr, u'[DEBUG]: ', message

# Command-line interface: one required positional argument (dictname) plus
# options selecting the source file, precise mode and verbose output.
usage = 'usage: %prog [options] dictname'
parser = optparse.OptionParser(usage=usage)
parser.add_option(
    '-s', '--source',
    dest='source_file',
    default='phrases.txt',
    metavar='FILE',
    help='specify source dict file.',
)
parser.add_option(
    '-p', '--precise',
    dest='precise',
    action='store_true',
    default=False,
    help='generate comments on interpreted words only.',
)
parser.add_option(
    '-v', '--verbose',
    dest='verbose',
    action='store_true',
    default=False,
    help='make lots of noice.',
)
options, args = parser.parse_args()

# The dictname is mandatory; it prefixes the keyword and phrase file names.
if not args:
    parser.error('missing dictname')
prefix = args[0]

max_word_length = 0
word_map = dict()

keyword_file = open('%s-keywords.txt' % prefix)
for line in keyword_file:
    x = line.strip().decode('utf-8').lstrip(u'\ufeff')
    if not x or x.startswith(u'#'):
        continue
    k, w = x.split(u'\t', 1)
    if w in word_map:
        word_map[w].add(k)
    else:
        word_map[w] = set([k])
    if len(w) > max_word_length:
        max_word_length = len(w)
keyword_file.close()

if options.verbose:
    print 'max-word-length: %d' % max_word_length

source_file = open(options.source_file)
phrase_file = open('%s-phrases.txt' % prefix, 'w')
print >> phrase_file, '# %s phrase file generated by %s' % (prefix, sys.argv[0])

count = 0
phrase = u''
freq = 0

def output_phrase(keywords, words):
    """Write one interpretation of the current phrase to phrase_file.

    The line has the form "<words>\\t<freq>\\t<keywords>", encoded as UTF-8.
    Keywords are always joined with spaces; words are concatenated directly
    when every word is a single character and joined with spaces otherwise.
    The frequency is taken from the module-level `freq`.
    """
    global freq
    if any(len(word) != 1 for word in words):
        separator = u' '
    else:
        separator = u''
    joined_words = separator.join(words)
    joined_keywords = u' '.join(keywords)
    record = u'%s\t%d\t%s' % (joined_words, freq, joined_keywords)
    print >> phrase_file, record.encode('utf-8')

def g(keywords, words, start):
    """Recursively enumerate interpretations of the module-level `phrase`.

    Starting at index `start`, every substring of at most `max_word_length`
    characters that appears in `word_map` is tried as the next word, once
    for each keyword recorded for it.  When the end of the phrase is
    reached, the accumulated `keywords` and `words` are passed to
    output_phrase() and the module-level `count` is incremented.
    """
    global count
    total = len(phrase)
    if start == total:
        output_phrase(keywords, words)
        count += 1
        return
    # Exclusive end index of the longest candidate word starting at `start`.
    stop = min(total, start + max_word_length)
    for end in range(start + 1, stop + 1):
        candidate = phrase[start:end]
        matches = word_map.get(candidate)
        if not matches:
            continue
        for keyword in matches:
            g(keywords + [keyword], words + [candidate], end)

for line in source_file:
    x = line.strip().decode('utf-8').lstrip(u'\ufeff')
    if not x or x.startswith(u'#'):
        continue
    phrase, freq_str = x.split(None, 1)
    try:
        freq = int(freq_str)
    except:
        print >> sys.stderr, 'error: invalid format (%s) %s' % (x, options.source_file)
        exit()
    count = 0 
    g([], [], 0)
    if count == 0:
        if options.verbose:
            print 'phrase %s is not interpreted.' % phrase
        if not options.precise:
            print >> phrase_file, (u'# no possible interpretations for %s' % phrase).encode('utf-8')
    elif count > 1:
        print >> phrase_file, (u'# %d possible interpretations for %s' % (count, phrase)).encode('utf-8')

source_file.close()
phrase_file.close()
print '%s-phrases.txt written.' % prefix

